library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggthemes)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(nycflights13)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(anyflights)
library(ggmap)
## ℹ Google's Terms of Service: ]8;;https://mapsplatform.google.com<https://mapsplatform.google.com>]8;;
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
COVID-19 ravaged New York in 2020, with the entire city and metropolitan area living under tight quarantine controls to flatten the COVID curve as mortalities rose. 19 years previous, the loss of the World Trade Center due to terrorist attacks similarly left the entirety of New York stunned and in fear. Both of these major world events caused a major airline recession, with flights being cancelled and delayed as fear of planes meant air travel was used mostly for serious reasons.
COVID 19 and 9/11 are two major events that impacted the travel industry. We would like to investigate how each event altered flight travel, and how NYC travel was able to recover after these events.
To investigate cancellations and departure delays, we decided to choose only airports in NYC for years:
2001 to investigate the effects of 9/11
2013 as a control group
2020 to investigate the effect of COVID right as the pandemic hit the United States. We considered March 2020 to be the month COVID was officially declared as a pandemic in the United States.
aline <- get_airlines()
aport<- get_airports()
JFK2020<-get_flights(“JFK”, 2020)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))
JFK2013<-get_flights(“JFK”, 2013)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))
JFK2001<-get_flights(“JFK”, 2001)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))
LGA2020<-get_flights(“LGA”, 2020)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))
LGA2013<-get_flights(“LGA”, 2013)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))
LGA2001<-get_flights(“LGA”, 2001)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))
EWR2020<-get_flights(“EWR”, 2020)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))
EWR2013<-get_flights(“EWR”, 2013)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))
EWR2001<-get_flights(“EWR”, 2001)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))
df <- rbind(LGA2001,JFK2001,EWR2001,LGA2013,JFK2013,EWR2013,LGA2020,JFK2020,EWR2020)
write.csv(df, file=“finalflights.csv”)
df <- read.csv("finalflights.csv")
head(df)
## X year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## 1 1 2001 1 1 18 1947 271 308 2331
## 2 2 2001 1 1 542 534 8 825 836
## 3 3 2001 1 1 557 600 -3 655 725
## 4 4 2001 1 1 605 600 5 830 819
## 5 5 2001 1 1 606 600 6 740 755
## 6 6 2001 1 1 607 600 7 715 726
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## 1 217 US 309 N287A\xe4 LGA PBI 151 1035 19 47
## 2 -11 AA 115 N429A1 LGA DFW 201 1389 5 34
## 3 -30 UA 113 N359\xe41 LGA ORD 105 733 6 0
## 4 11 DL 281 N542D1 LGA ATL 105 761 6 0
## 5 -15 NW 143 N354N\xe6 LGA MEM 137 963 6 0
## 6 -11 AA 17 N518A1 LGA ORD 103 733 6 0
## time_hour name.x name.y lat_origin
## 1 2001-01-01 19:00:00 US Airways Inc. La Guardia Airport 40.7772
## 2 2001-01-01 05:00:00 American Airlines Inc. La Guardia Airport 40.7772
## 3 2001-01-01 06:00:00 United Air Lines Inc. La Guardia Airport 40.7772
## 4 2001-01-01 06:00:00 Delta Air Lines Inc. La Guardia Airport 40.7772
## 5 2001-01-01 06:00:00 Northwest Airlines Inc. La Guardia Airport 40.7772
## 6 2001-01-01 06:00:00 American Airlines Inc. La Guardia Airport 40.7772
## lon_origin alt_origin tz_origin dst_origin tzone_origin
## 1 -73.8726 21 -5 A America/New_York
## 2 -73.8726 21 -5 A America/New_York
## 3 -73.8726 21 -5 A America/New_York
## 4 -73.8726 21 -5 A America/New_York
## 5 -73.8726 21 -5 A America/New_York
## 6 -73.8726 21 -5 A America/New_York
## name lat_dest lon_dest alt_dest
## 1 Palm Beach International Airport 26.6832 -80.0956 19
## 2 Dallas Fort Worth International Airport 32.8968 -97.0380 607
## 3 Chicago O'Hare International Airport 41.9786 -87.9048 672
## 4 Hartsfield Jackson Atlanta International Airport 33.6367 -84.4281 1026
## 5 Memphis International Airport 35.0424 -89.9767 341
## 6 Chicago O'Hare International Airport 41.9786 -87.9048 672
## tz_dest dst_dest tzone_dest
## 1 -5 A America/New_York
## 2 -6 A America/Chicago
## 3 -6 A America/Chicago
## 4 -5 A America/New_York
## 5 -6 A America/Chicago
## 6 -6 A America/Chicago
First, we wanted to see the locations of destinations most traveled to originating from the NYC metropolitan area. To do this, we find volumes listed at each destination, and then overlay them with sizes corresponding over a US Map.
# Load the airport locations from the `airports` dataset
airport_locs <- aport %>%
select(lon, lat, faa, tz) %>%
rename(longitude = lon, latitude = lat)
# Join the flights dataset with the airport locations dataset
flights_with_locs <- df %>%
left_join(airport_locs, by = c("dest" = "faa"))
# Group the data by the destination airport and count the number of flights to each airport
flight_volume <- flights_with_locs %>%
group_by(dest, longitude, latitude) %>%
summarize(volume = n()) %>%
ungroup()
## `summarise()` has grouped output by 'dest', 'longitude'. You can override using
## the `.groups` argument.
# Get a map of the United States area
bbox <- c(bottom = 25.75, top = 49 , right = -67, left = -125)
usmap <- get_stamenmap(bbox = bbox, zoom = 4, maptype = 'toner-lite')
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# Create a scatter plot of the data, with the size of the points representing flight volume
map <- ggmap(usmap) +
geom_point(data = flight_volume, aes(x = longitude, y = latitude, size = volume), alpha = 0.5) +
scale_size(range = c(2, 10)) +
labs(title = "Flight Volume to US Destination Airports",
subtitle = "Size of points represents flight volume",
size = "Flight Volume")
map
## Warning: Removed 3 rows containing missing values (geom_point).
In order to investigate average departure delay per month, we decided to see the distribution of departure delays per year. There was a large outlier effect considering some flight had delays that were hours long. For the rest of our analysis, we decided to only consider flights that had no more than a 3 hour delay.
plot_ly(df, x = ~dep_delay, type = "box")
## Warning: Ignoring 47282 observations
#Per Year
p2 <- plot_ly(df, x = ~year, y = ~dep_delay) %>%
add_boxplot(name = "year vs Departure Delay")
p2
## Warning: Ignoring 47282 observations
To investigate average delays per month, we first had to seperete the whole dataset by specific years. This is so each mean is calculated according to these years and not based off the whole dataset. As stated previously we also took out any flights that had a departure delay greater than 3 hours. You can see that average departure delays actually decreased during COVID. This means that travel may have been more efficient during COVID. 9/11 stayed around our control group of 2013. Although 2001 did have a lower average departure delay than 2013, it was nowhere near COVID which actually experienced negative average departure delay. You can also see that September 2001 had the highest average departure delay compared to September 2013 and 2020. In all, we can say that 2020 had the lowest average departure delay for every month.
delays2020<- df %>%
drop_na()%>%
filter(year==2020) %>%
filter(dep_delay<180)%>%
group_by(month)%>%
summarise(avg_dep_delay=mean(dep_delay, na.rm=TRUE))
delays2013<- df %>%
drop_na()%>%
filter(year==2013) %>%
filter(dep_delay<180)%>%
group_by(month)%>%
summarise(avg_dep_delay=mean(dep_delay, na.rm=TRUE))
delays2001<- df %>%
drop_na()%>%
filter(year==2001) %>%
filter(dep_delay<180)%>%
group_by(month)%>%
summarise(avg_dep_delay=mean(dep_delay, na.rm=TRUE))
delays<- data.frame(delays2001,delays2013,delays2020)
delaysfig <- plot_ly(delays, x = ~month, y = ~avg_dep_delay, name = '2001', type = 'scatter', mode = 'lines+markers')
delaysfig <- delaysfig %>% add_trace(x= ~month.1, y = ~avg_dep_delay.1, name = '2013', mode = 'lines+markers')
delaysfig <- delaysfig %>% add_trace(x= ~month.2, y = ~avg_dep_delay.2, name = '2020', mode = 'lines+markers')
delaysfig <- delaysfig %>% layout(title = "Average Departure Delay",
xaxis = list(title = "Month"),
yaxis = list (title = "Minutes"))
delaysfig
To find the number of cancellations that normally occur, we examine 2013 data, and then compare these to the years that were affected by disaster shocks. You can see that COVID had cancellations far greater than 9/11.
cancellations2020<- df %>%
filter(year==2020) %>%
group_by(month)%>%
summarise(cancelled=100*mean(is.na(dep_delay)))
cancellations2013<- df %>%
filter(year==2013) %>%
group_by(month)%>%
summarise(cancelled=100*mean(is.na(dep_delay)))
cancellations2001<- df %>%
filter(year==2001) %>%
group_by(month)%>%
summarise(cancelled=100*mean(is.na(dep_delay)))
cancellations<- data.frame(cancellations2001,cancellations2013,cancellations2020)
cancellationsfig <- plot_ly(cancellations, x = ~month, y = ~cancelled, name = '2001', type = 'scatter', mode = 'lines+markers')
cancellationsfig <- cancellationsfig %>% add_trace(x= ~month.1, y = ~cancelled.1, name = '2013', mode = 'lines+markers')
cancellationsfig <- cancellationsfig %>% add_trace(x= ~month.2, y = ~cancelled.2, name = '2020', mode = 'lines+markers')
cancellationsfig <- cancellationsfig %>% layout(title = "Percentage of Cancelled Flights",
xaxis = list(title = "Month"),
yaxis = list (title = "Percentage"))
cancellationsfig
We wanted to investigate whether amount of cancellations were varying across airports in NYC. We did this for years 2020 for COVID, 2013 as a control, and 2001 for 9/11. We calculated these cancellations as a proportion for that year so we can disregard the fact that airports may have different flight volumes. Each year, LGA seems to have the most cancellations. You can also see the shocks of cancellations that happened for each event. Using 2013 as our control, we can see that flight cancelations don’t exceed more than 5.5 percent for any airport for that year. This percentage is greatly higher in the years where the shocks happened. However, you can see recovery shortly after the shocks, with percentage of cancelled flights decreasing dramatically.
cancellations2020a<- df %>%
filter(year==2020) %>%
group_by(month, origin)%>%
summarise(cancelled=100*mean(is.na(dep_delay)))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
cancellations2013a <- df %>%
filter(year==2013) %>%
group_by(month, origin)%>%
summarise(cancelled=100*mean(is.na(dep_delay)))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
cancellations2001a <- df %>%
filter(year==2001) %>%
group_by(month, origin)%>%
summarise(cancelled=100*mean(is.na(dep_delay)))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
cancellationsa <- data.frame(cancellations2001a,cancellations2013a,cancellations2020a)
ggplot(cancellationsa, aes(x = month, y= cancelled, color = origin)) +
geom_line() + labs(title = "Percentage of Flights Cancelled Every Month",
x = "Month",
y = "Percentage of Flights")
ggplot(cancellationsa, aes(x = month.1, y= cancelled.1, color = origin.1)) +
geom_line()+ labs(title = "Percentage of Flights Cancelled Every Month",
x = "Month",
y = "Percentage of Flights")
ggplot(cancellationsa, aes(x = month.2, y= cancelled.2, color = origin.2)) +
geom_line()+ labs(title = "Percentage of Flights Cancelled Every Month",
x = "Month",
y = "Percentage of Flights")
To find out how the NYC airline industry eventually recovered, we took time series data from succeeding years after the shock to find how flight volumes eventually rebounded to pre-shock levels. We chose to only find recovery data for JFK and EWR because the planes involved in 9/11 departed from these airports.
recovery <- read.csv("Downloads/recovery.csv")
recovery2002<- recovery%>%
drop_na()%>%
filter(year == 2002) %>%
group_by(month) %>%
summarise(volume = n())
recovery2001<- df %>%
filter(origin %in% c("EWR", "JFK")) %>%
drop_na()%>%
filter(year == 2001) %>%
group_by(month) %>%
summarise(volume = n())
recoveryline<- data.frame(recovery2001,recovery2002)
recoveryfig <- plot_ly(recoveryline, x = ~month, y = ~volume, name = '2001', type = 'scatter', mode = 'lines+markers')
recoveryfig <- recoveryfig %>% add_trace(x= ~month.1, y = ~volume.1, name = '2002', mode = 'lines+markers')
recoveryfig <- recoveryfig %>% layout(title = "Volume of Flights Per Month After Shock",
xaxis = list(title = "Month"),
yaxis = list (title = "Volume"))
recoveryfig
To find out how the NYC airline industry eventually recovered, we took time series data from succeeding years after the shock to find how flight volumes eventually rebounded to pre-shock levels.
recovery2022<- recovery %>%
drop_na()%>%
filter(year == 2022) %>%
group_by(month) %>%
summarise(volume = n())
recovery2020<- df %>%
filter(origin %in% c("EWR", "JFK")) %>%
drop_na()%>%
filter(year == 2020) %>%
group_by(month) %>%
summarise(volume = n())
recoveryline2 <- data.frame(recovery2020,recovery2022)
recoveryfig2 <- plot_ly(recoveryline2, x = ~month, y = ~volume, name = '2020', type = 'scatter', mode = 'lines+markers')
recoveryfig2 <- recoveryfig2 %>% add_trace(x= ~month.1, y = ~volume.1, name = '2022', mode = 'lines+markers')
recoveryfig2 <- recoveryfig2 %>% layout(title = "Volume of Flights Per Month After Shock",
xaxis = list(title = "Month"),
yaxis = list (title = "Volume"))
recoveryfig2